This analysis is using the RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR vignette (http://129.217.206.11/packages/3.9/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html) to pre-process data and conduct differential gene expression analysis between Stage IA lung cancer patients with lung adenocarcinoma and lung squamous cell carcinoma.
This dataset of 40 lung cancer patients were downloaded from GDC data portal with GDC RNASeq Tool (https://github.com/cpreid2/gdc-rnaseq-tool) and then processed by MergedCount_processing_adeno.ipynb python script before loading in to R for analysis.
In this step, all necessary packages are loaded and working directory is set so that all the files can be loaded into the R Studio environment.
library(limma)
library(Glimma)
library(edgeR)
library(dplyr)
library(magrittr)
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "~/Desktop/USC/Fall_2020/TRGN_510/510_FinalProject")
The genecount, clinical metadata and exposure data are read in here. After reading in the clincial metadata and exposure data, they are merged together so that we only have to deal with one dataframe of metadata. Before merging, duplicated samples need to be removed so that we have one entry for each of the patients and the row number of the metadata can match the column number of the genecount data. The diagnosis variable in metadata is collapsed into only two categories: Adenocarcinoma and Squamous cell carcinoma, so that these two group can be compared against each other in differential gene expression analysis later on.
# Reading in count data
count_matrix = read.table(file = 'data/count_processed.txt',sep = '\t',header = T,stringsAsFactors = F, check.names = F, row.names = 1)
count_matrix = count_matrix[,order(names(count_matrix))]
#Reading in clinical metadata
clinical_metadata = read.table(file = 'data/clinical_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
clinical_metadata = clinical_metadata[!duplicated(clinical_metadata[ , c("case_submitter_id")]),]
# Reading in exposure metadata
exposure_metadata = read.csv(file = 'data/exposure_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
# Merging metadata
all_metadata = merge(x = clinical_metadata, y = exposure_metadata, by = 'case_submitter_id')
all_metadata$ethnicity <- as.factor(all_metadata$ethnicity)
all_metadata$gender <- as.factor(all_metadata$gender)
all_metadata%<>%
mutate(diagnosis=case_when(
primary_diagnosis %in% c("Adenocarcinoma with mixed subtypes","Adenocarcinoma, NOS","Bronchiolo-alveolar adenocarcinoma, NOS","Papillary adenocarcinoma, NOS") ~ c("Adenocarcinoma"),
primary_diagnosis %in% c("Squamous cell carcinoma, large cell, nonkeratinizing, NOS","Squamous cell carcinoma, NOS") ~ c("Squamous_cell_carcinoma")
))
all_metadata$diagnosis = as.factor(all_metadata$diagnosis)
all_metadata$race <- as.factor(all_metadata$race)
rownames(all_metadata) <- all_metadata$case_submitter_id
all_metadata = all_metadata[,-1]
all_metadata = all_metadata[order(rownames(all_metadata)),]
DGEList Object is created with genecount data and metadata from the previous step. By reading in the genecode reference file from GDC data portal, I am able to map ensembl IDs in the gene count data to their corresponding gene names. Gene name annotation will be availble to us in the analysis later on. It’s easier to read gene names!
# Creating DGEList Object
geneExpr = DGEList(counts = count_matrix, samples = all_metadata)
geneExpr$samples$group=all_metadata$diagnosis
geneid = rownames(geneExpr)
# Importing genecode reference to map annotation from DGC website
gencode = read.table('data/gencode.gene.info.v22.tsv',sep = '\t',header = T,stringsAsFactors = F, check.names = F)
genes = gencode[geneid %in% gencode$gene_id, ]
genes = genes[,c("gene_id","gene_name","seqname")]
rownames(genes) = genes$gene_id
genes = genes[order(genes$gene_id),]
geneExpr$genes = genes[,-1]
Because not all samples is seequenced with the same depth, we always like to know the average library size in differential expression analysis to compare all the samples equally. Here we estimated average library size by calculating CPM.
cpm <- cpm(geneExpr)
lcpm <- cpm(geneExpr, log=TRUE)
L <- mean(geneExpr$samples$lib.size) * 1e-6
M <- median(geneExpr$samples$lib.size) * 1e-6
c(L,M)
[1] 52.92581 48.81745
The average library size for this dataset is about 51.9 million.
In RNA-sequencing data, it is very common to have genes that are not expressed. In order to investigate only genes that are differentially expressed between lung adenocarcinoma patients and lung squamous cell carcinoma patient, genes that are not expressed in all 40 samples are removed.
table(rowSums(geneExpr$counts==0)==40)
FALSE TRUE
53740 6743
keep.exprs <- filterByExpr(geneExpr, group=geneExpr$samples$group)
Around 11% of genes in the dataset have zero counts across all 40 samples.
geneExpr <- geneExpr[keep.exprs,, keep.lib.sizes=FALSE]
dim(geneExpr)
[1] 21170 40
In this dataset, the median library size is 48.8 million and 10/48.8 is about 0.2, therefore the filterByExpr function keeps genes that have a CPM of 0.2 or more. With this cutoff the number of genes are reduced to 21170, about 35% of genes from what I started with.
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(geneExpr)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.7), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
lcpm_filtered <- cpm(geneExpr, log=TRUE)
plot(density(lcpm_filtered[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm_filtered[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
This graph showed that before filtering the data, a large portion of genes within each samples are lowly-expressed with log-CPM values that are small or negative. After removing the lowly expressed genes (B), a large portion of genes in each sample pass the log-CPM threshold.
Normalizing genecount data can normalize the effects of non-biological external factors (e.g. sequencing batch, technicians etc.). So that the final differential expression results will probably be due to biological difference between groups.
geneExpr = calcNormFactors(geneExpr, method = "TMM")
geneExpr$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847 0.9553945 1.1161617
[15] 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376
[29] 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
geneExpr_unnormalized <- geneExpr
geneExpr_unnormalized$samples$norm.factors <- 1
par(mfrow=c(1,2))
lcpm_unnormalized <- cpm(geneExpr_unnormalized, log=TRUE)
boxplot(lcpm_unnormalized, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
geneExpr_normalized <- calcNormFactors(geneExpr_unnormalized)
geneExpr_normalized$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847 0.9553945 1.1161617
[15] 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376
[29] 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
lcpm_normalized <- cpm(geneExpr_normalized, log=TRUE)
boxplot(lcpm_normalized, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
This figure showed that after normalization samples are more similar to each other in term of gene count.
lcpm <- cpm(geneExpr, log=TRUE)
par(mfrow=c(1,2))
col.group <- geneExpr$samples$group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
col.race <- geneExpr$samples$race
levels(col.race) <- brewer.pal(nlevels(col.race), "Set2")
col.race <- as.character(col.race)
col.gender <- geneExpr$samples$gender
levels(col.gender) <- brewer.pal(nlevels(col.gender), "Set3")
col.gender <- as.character(col.gender)
col.age <- geneExpr$samples$age_at_diagnosis
levels(col.age) <- brewer.pal(nlevels(col.age), "Set1")
col.age <- as.character(col.age)
col.ethnicity <- geneExpr$samples$ethnicity
levels(col.ethnicity) <- brewer.pal(nlevels(col.ethnicity), "Set2")
col.ethnicity <- as.character(col.ethnicity)
col.cigar_per_day <- geneExpr$samples$cigarettes_per_day
levels(col.cigar_per_day) <- brewer.pal(nlevels(col.cigar_per_day), "Set3")
col.cigar_per_day <- as.character(col.cigar_per_day)
plotMDS(lcpm,labels=geneExpr$samples$group, col=col.group)
title(main="A. Sample groups")
plotMDS(lcpm, labels=geneExpr$samples$gender, col=col.gender, dim=c(3,4))
title(main="B. Sex")
plotMDS(lcpm, labels=geneExpr$samples$age_at_diagnosis, col=col.age, dim=c(3,4))
title(main="C. Age at diagnosis")
plotMDS(lcpm, labels=geneExpr$samples$race, col=col.race, dim=c(3,4))
title(main="D. Race")
plotMDS(lcpm, labels=geneExpr$samples$ethnicity, col=col.ethnicity, dim=c(3,4))
title(main="E. Ethnicity")
plotMDS(lcpm, labels=geneExpr$samples$cigarettes_per_day, col=col.cigar_per_day, dim=c(3,4))
title(main="F. Cigarettes per day")
In the MDS plot, there is some separation between patients diagnosed with Adenocarcinoma and Squamous cell carcinoma. You can also see male and female patients are segregating together. However, you cannot see any distinct clusters based on Age at diagnosis, Race, Ethnicity and Cigarette per day. Therefore when we construct the design matrix, we will take sex into account and not the other variables.
Design matrix is created by including variables of interest (lung cancer diagnosis) and variable (sex) that can cause an effect based on the results from MDS plots above. A contrast matrix is setting up comparison between two conditions (Adenocarcinoma vs Squamous cell carcinoma, considering effects of sex).
diagnosis = geneExpr$samples$group
ethnicity = geneExpr$samples$ethnicity
sex = geneExpr$samples$gender
race = geneExpr$samples$race
age_at_diagnosis = geneExpr$samples$age_at_diagnosis
smoking_hist = geneExpr$samples$cigarettes_per_day
design = model.matrix(~0+diagnosis+sex)
colnames(design) <- gsub("diagnosis", "", colnames(design))
contr.matrix <- makeContrasts(
AdenocarcinomavsSquamous_cell_carcinoma = Adenocarcinoma-Squamous_cell_carcinoma,
levels = colnames(design))
contr.matrix
Contrasts
Levels AdenocarcinomavsSquamous_cell_carcinoma
Adenocarcinoma 1
Squamous_cell_carcinoma -1
sexmale 0
Voom is performed and linear models are fitted for comparisons between lung adenocarcinoma and lung squamous cell carcinoma. Empirical Bayesian moderation is also performed to obtain more precise estimates of gene-wise comparisons.
par(mfrow=c(1,2))
v <- voom(geneExpr, design, plot=TRUE)
v
An object of class "EList"
$genes
21165 more rows ...
$targets
35 more rows ...
$E
TCGA-22-4613 TCGA-22-5489 TCGA-22-5491 TCGA-33-AAS8 TCGA-34-5231 TCGA-43-7656 TCGA-43-8118 TCGA-44-5645 TCGA-44-A47G
ENSG00000000003.13 6.578393 4.518445 7.050818 5.481583 6.016010 5.138312 5.436772 5.635659 5.702467
ENSG00000000419.11 5.006541 5.125550 6.009261 5.535389 5.227853 6.273798 5.677880 4.093028 4.281239
ENSG00000000457.12 3.899691 4.351443 4.164341 3.347290 4.310104 4.022563 3.425331 5.427188 3.977077
ENSG00000000460.15 3.720341 4.789466 4.070052 3.040155 4.109311 3.920045 3.333184 4.295921 2.528768
ENSG00000000938.11 4.311517 4.032854 2.267258 2.617683 2.708592 3.026093 5.302423 2.653449 5.680948
TCGA-46-3765 TCGA-46-3766 TCGA-49-4486 TCGA-49-4487 TCGA-49-AARO TCGA-50-5942 TCGA-50-5946 TCGA-50-8457 TCGA-50-8460
ENSG00000000003.13 6.113541 6.379925 6.279390 5.972862 6.359283 5.745927 5.480757 5.061664 5.801764
ENSG00000000419.11 6.092018 5.071231 5.475511 5.916025 4.660764 4.466833 5.072669 4.197116 4.935787
ENSG00000000457.12 3.866716 3.928589 4.975262 4.130680 3.932304 4.777173 4.218002 4.293238 3.792598
ENSG00000000460.15 3.742487 2.508825 2.713849 4.464368 2.723976 1.971207 3.939701 2.450756 1.987285
ENSG00000000938.11 3.611814 4.778872 2.166361 5.063277 5.362237 2.672767 1.804396 4.286484 4.685892
TCGA-55-7726 TCGA-55-8089 TCGA-56-5897 TCGA-56-A4ZJ TCGA-56-A5DR TCGA-63-A5MH TCGA-63-A5MY TCGA-64-1676 TCGA-68-8250
ENSG00000000003.13 5.618804 5.648883 4.869300 5.409876 5.176682 5.836250 5.890766 7.721230 6.362591
ENSG00000000419.11 6.016039 5.237966 5.088500 5.088331 5.362201 5.024777 6.699551 6.669445 6.051110
ENSG00000000457.12 3.814772 4.220320 3.935572 3.573813 4.258951 4.356796 4.139408 4.442916 3.510752
ENSG00000000460.15 3.384872 3.679206 3.653911 2.581473 4.020476 4.482014 4.332709 3.469883 3.347400
ENSG00000000938.11 3.749970 5.450806 4.716595 5.280737 2.910954 2.161463 2.534827 4.735432 4.758486
TCGA-73-4662 TCGA-77-A5G7 TCGA-85-8048 TCGA-86-7953 TCGA-86-8076 TCGA-90-7766 TCGA-91-6828 TCGA-93-A4JO TCGA-96-7545
ENSG00000000003.13 6.141768 5.962677 6.170662 5.978361 5.688321 6.386395 5.854894 6.055768 6.086418
ENSG00000000419.11 4.699274 6.046526 5.186354 4.206129 4.600721 5.411248 4.413630 5.220789 5.534683
ENSG00000000457.12 5.172006 4.208445 4.329645 3.843559 4.470898 4.452007 4.641835 4.296636 3.867595
ENSG00000000460.15 4.412238 4.220873 4.220775 4.150741 2.488662 3.854044 2.833918 3.301439 2.842824
ENSG00000000938.11 4.545843 2.619316 5.029770 4.653009 4.916381 3.879929 4.778828 5.055971 4.323060
TCGA-98-A53C TCGA-99-AA5R TCGA-NJ-A4YQ TCGA-O1-A52J
ENSG00000000003.13 4.356866 4.898964 5.288840 5.802251
ENSG00000000419.11 4.524946 4.397344 4.731263 5.020307
ENSG00000000457.12 3.721552 4.155948 4.791851 4.284247
ENSG00000000460.15 2.332691 2.017264 3.217170 3.054791
ENSG00000000938.11 5.647853 6.152786 4.741985 5.868203
21165 more rows ...
$weights
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
[1,] 1.9156382 2.265171 2.264350 2.213717 2.251193 2.259541 2.213335 2.134201 2.196382 2.178912 2.103449 2.260762 1.9845015 2.199452 2.147720
[2,] 1.7274124 2.263898 2.259424 2.130467 2.259430 2.249206 2.129639 1.709450 1.843064 2.068303 1.956363 2.159220 1.4994892 1.850504 1.736032
[3,] 1.1247909 1.889080 1.831902 1.519393 2.058846 1.750188 1.518138 1.492255 1.626831 1.428855 1.303977 1.918998 1.3029032 1.634603 1.518237
[4,] 0.9981961 1.764861 1.705691 1.323075 1.957294 1.621630 1.321983 1.007585 1.085743 1.245279 1.141339 1.377043 0.9075725 1.090570 1.022096
[5,] 1.2599568 1.638856 1.578728 1.704841 1.839755 1.497199 1.703511 1.700961 1.835126 1.608547 1.469780 1.696029 1.4916402 1.842526 1.728006
[,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30]
[1,] 2.261012 2.239719 2.192143 1.912564 2.265181 2.254155 2.229398 2.217851 2.221969 2.247060 2.0726890 2.238861 2.258105 2.162523 2.207645
[2,] 2.236428 1.982529 1.936786 1.422262 2.210388 2.240132 2.157465 2.188616 2.194131 2.229313 1.7227708 2.217553 2.217851 2.116843 2.174970
[3,] 2.096214 1.780240 1.608988 1.237064 2.026705 1.700806 1.572091 1.526228 1.539838 1.649835 1.3955026 1.609251 2.124609 1.389669 1.493012
[4,] 1.590228 1.189021 1.130158 0.874648 1.494592 1.571770 1.369460 1.405102 1.417681 1.522205 0.9983615 1.483421 1.549154 1.279861 1.374249
[5,] 1.913948 1.975738 1.388865 1.414733 1.821489 1.450037 1.757968 1.295786 1.307260 1.403482 1.2056881 1.367369 2.215406 1.183066 1.267782
[,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40]
[1,] 2.263954 2.264046 2.260432 2.0272364 2.252693 2.259175 2.235016 2.216680 2.171008 2.221055
[2,] 2.125441 2.182022 2.229632 1.6608334 2.118335 2.248536 2.170787 1.901291 1.781816 1.914839
[3,] 1.965746 1.964307 1.768653 1.3415540 1.847571 1.745479 1.598156 1.688464 1.563408 1.703532
[4,] 1.347076 1.422712 1.554026 0.9676036 1.311413 1.616765 1.392875 1.125273 1.047906 1.134973
[5,] 2.120177 1.746381 1.945727 1.1615881 1.619536 1.492648 1.784314 1.893762 1.773524 1.907243
21165 more rows ...
$design
Adenocarcinoma Squamous_cell_carcinoma sexmale
1 0 1 0
2 0 1 1
3 0 1 1
4 0 1 0
5 0 1 1
35 more rows ...
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
With an adjusted p-value cutoff of 5%, there are 2482 down-regualted genes and 1988 up-regulated differentially expressed gene between lung adenocarcinoma and squamous cell carcinoma patients in early stage.
summary(decideTests(efit))
AdenocarcinomavsSquamous_cell_carcinoma
Down 2482
NotSig 16700
Up 1988
res = topTable(efit,sort.by = "P",n=Inf)
head(res)
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
summary(dt)
AdenocarcinomavsSquamous_cell_carcinoma
Down 320
NotSig 20769
Up 81
After limiting the LFC threshold to 1, there are less DEGs and we can focus on genes that are the most differentially expressed.
The MD plot below show how many genes are differentially expressed and their direction with an adjusted p-value cutoff of 5% and LFC cutoff of 1. Each dot represents one gene.
plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1],
xlim=c(-8,13))
glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1],
side.main="gene_name", counts=lcpm, groups=geneExpr$samples$group, launch=FALSE)
This generates an interactive MD plot, which provides information for each gene.
The following heatmap show the top 100 genes according to p-value. It also performed an unsupervised clustering of the samples. Based on the dendrogram, almost all samples from two different groups are separated except one squamous cell carcinoma sample found within the adenocarcinoma cluster on the left. This could be a potential outlier, some other tests (such as PCA) need to be performed to evaluate whether it need to be removed. From the heat map, you can see the expression pattern is consistence withino two groups, which is as expected because it is showing top 100 DEGs according to p-value.
library(gplots)
par(mar = rep(2, 4))
Adenocarcinoma.vs.SquamousCell.topgenes <- res$gene_name[1:100]
i <- which(v$genes$gene_name %in% Adenocarcinoma.vs.SquamousCell.topgenes)
mycol <- colorpanel(1000,"blue","white","red")
heatmap.2(lcpm[i,], scale="row",
labRow=v$genes$gene_name[i], labCol=v$targets$group,
col=mycol, trace="none", density.info="none",
margin=c(8,12), lhei=c(2,10), dendrogram="column")
Before using gene set enrichment analysis, we need to map ENTREZ ID to gene name so that it is able to create an index from the c2 genes signature from the Broad Institute’s MSigDB c2 collection (http://bioinf.wehi.edu.au/software/MSigDB/). Couple pathways have a significant p-value but all of them have a very high FDR. Since adenocarcinoma is starting in mucous producing cell in lung, I was hoping to see some mucous related pathways in the gene set enrichment analysis. The EGF response pathway was found to be downregulated in diverse tumor type according to GSEA, which is consistence with my results.
library(Homo.sapiens)
geneid = v$genes$gene_name
genes = select(Homo.sapiens, keys = geneid, columns = "ENTREZID", keytype = "SYMBOL")
genes <- genes[!duplicated(genes$SYMBOL),]
genes_v = v$genes
genes_v$ensembl = rownames(genes_v)
genes_v_new = merge(genes_v,genes,by.x="gene_name",by.y="SYMBOL",sort=F)
rownames(genes_v_new) = genes_v_new$ensembl
genes_v_new = genes_v_new[,-3]
v$genes = genes_v_new
load("data/human_c2_v5p2.rdata")
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam.AdenovsSqua <- camera(v,idx,design,contrast = contr.matrix[,1])
head(cam.AdenovsSqua,20)
NA
barcodeplot(efit$t[,1], index=idx$AMIT_EGF_RESPONSE_20_MCF10A,
index2=idx$DORN_ADENOVIRUS_INFECTION_24HR_DN, main="Adenocarcinoma Vs Squamous Cell Carcinoma")
This graph is showing relative enrichment for Adenocarcinoma vs Squamous cell carcinoma in lung cancer.